library(nycflights13)
library(tidyverse)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.2     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

5.5 Add new variables with mutate()

# Narrow the data set to the date columns, the two delay columns, distance
# and air time, so that newly created variables are easy to see when printed.
# (The original line fused two statements and used typographic quotes, which
# R cannot parse.)
flights_sml <- select(
  flights,
  year:day,
  ends_with("delay"),
  distance,
  air_time
)

# gain: departure delay minus arrival delay.
# speed: distance covered per hour of air time (air_time / 60).
mutate(
  flights_sml,
  gain = dep_delay - arr_delay,
  speed = distance / air_time * 60
)

# Columns created inside one mutate() call can be reused immediately:
# gain_per_hour is built from the gain and hours columns defined just above it.
flights_sml %>%
  mutate(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours
  )

5.5.1 Useful creation functions

  • Arithmetic operators: +, -, *, /, ^
  • Modular arithmetic: %/% (integer division) and %% (remainder), where x == y * (x %/% y) + (x %% y)

# Keep dep_time and split it into its hour part (integer division by 100)
# and its minute part (remainder after dividing by 100).
flights %>%
  transmute(
    dep_time,
    hour = dep_time %/% 100,
    minute = dep_time %% 100
  )

  • Logs: log(), log2(), log10()
  • Offsets: lead() and lag() allow you to refer to leading or lagging values
  • Cumulative and rolling aggregates: R provides functions for running sums, products, mins and maxes: cumsum(), cumprod(), cummin(), cummax(); and dplyr provides cummean() for cumulative means
  • Logical comparisons, <, <=, >, >=, !=, and ==, which you learned about earlier
  • Ranking: there are a number of ranking functions, but you should start with min_rank()
  • If min_rank() doesn’t do what you need, look at the variants row_number(), dense_rank(), percent_rank(), cume_dist(), ntile()
# Bare top-level expression: auto-prints the flights tibble.
flights

5.5.2 Exercises

1).

# Integer division by 100 extracts the hour part of the HHMM value 1504.
1504 %/% 100
## [1] 15
# The remainder after dividing by 100 is the minute part.
1504 %% 100
## [1] 4
# Hours * 60 plus minutes gives the number of minutes since midnight.
1504 %/% 100 * 60 + 1504 %% 100
## [1] 904

# Convert the HHMM-encoded departure times to minutes since midnight.
# The trailing `%% 1440` wraps a result of 1440 (an input of 2400) back to 0;
# presumably midnight is stored as 2400 in this data set -- confirm.
flights_times <- mutate(
  flights,
  dep_time_mins = (dep_time %/% 100 * 60 + dep_time %% 100) %% 1440,
  sched_dep_time_mins =
    (sched_dep_time %/% 100 * 60 + sched_dep_time %% 100) %% 1440
)

# view only relevant columns
# (This select() used to sit behind the `#` on the same line as the mutate(),
# so it was treated as part of the comment and never executed.)
select(
  flights_times,
  dep_time, dep_time_mins, sched_dep_time, sched_dep_time_mins
)

We could define a function, time2mins(), that converts a vector of times from the format used in flights to minutes since midnight.

2).

air_time is the difference between the arrival (arr_time) and departure times (dep_time). In other words, air_time = arr_time - dep_time.

  • The flight passes midnight, so arr_time < dep_time. In these cases, the difference in airtime should be off by 24 hours (1,440 minutes).

  • The flight crosses time zones, and the total air time will be off by hours (multiples of 60).

  • Given the time zones in the US, the differences due to time zone should be 60 minutes (Central), 120 minutes (Mountain), 180 minutes (Pacific), 240 minutes (Alaska), or 300 minutes (Hawaii).

3). We would expect the departure delay (dep_delay) to be equal to the difference between the actual departure time (dep_time) and the scheduled departure time (sched_dep_time): dep_time - sched_dep_time = dep_delay.

4).

# Rank every flight by departure delay (longest delay first) using three
# ranking functions, keep the rows whose rank is at most 10 under all three,
# and order them by min_rank.
flights_delayed <- flights %>%
  mutate(
    dep_delay_min_rank = min_rank(desc(dep_delay)),
    dep_delay_row_number = row_number(desc(dep_delay)),
    dep_delay_dense_rank = dense_rank(desc(dep_delay))
  ) %>%
  filter(
    dep_delay_min_rank <= 10,
    dep_delay_row_number <= 10,
    dep_delay_dense_rank <= 10
  ) %>%
  arrange(dep_delay_min_rank)

# Show every remaining row with the identifying columns and the three ranks.
flights_delayed %>%
  select(
    month, day, carrier, flight, dep_delay,
    dep_delay_min_rank, dep_delay_row_number, dep_delay_dense_rank
  ) %>%
  print(n = Inf)
## # A tibble: 10 x 8
##    month   day carrier flight dep_delay dep_delay_min_r… dep_delay_row_n…
##    <int> <int> <chr>    <int>     <dbl>            <int>            <int>
##  1     1     9 HA          51      1301                1                1
##  2     6    15 MQ        3535      1137                2                2
##  3     1    10 MQ        3695      1126                3                3
##  4     9    20 AA         177      1014                4                4
##  5     7    22 MQ        3075      1005                5                5
##  6     4    10 DL        2391       960                6                6
##  7     3    17 DL        2119       911                7                7
##  8     6    27 DL        2007       899                8                8
##  9     7    22 DL        2047       898                9                9
## 10    12     5 AA         172       896               10               10
## # … with 1 more variable: dep_delay_dense_rank <int>
  • The dplyr package provides multiple functions for ranking, which differ in how they handle tied values: row_number(), min_rank(), dense_rank()

  • row_number() assigns each element a unique value

  • min_rank() and dense_rank() assign tied values the same rank, but differ in how they assign values to the next rank

5).

# Spelled-out element-wise sums: the left operands cycle through 1, 2, 3
# while the right operands run from 1 to 10.
c(1 + 1, 2 + 2, 3 + 3, 1 + 4, 2 + 5, 3 + 6, 1 + 7, 2 + 8, 3 + 9, 1 + 10)
##  [1]  2  4  6  5  7  9  8 10 12 11

6).

  • R provides functions for the three primary trigonometric functions: sine (sin()), cosine (cos()), and tangent (tan())

5.6 Grouped summaries with summarise()

5.6.1 Combining multiple operations with the pipe

# Step 1: group flights by destination.
# (The original line fused three statements and used typographic quotes
# around HNL, which R cannot parse.)
by_dest <- group_by(flights, dest)

# Step 2: per destination, compute the number of flights, the mean distance
# and the mean arrival delay, ignoring missing values.
delay <- summarise(
  by_dest,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)

# Step 3: drop destinations with 20 or fewer flights and Honolulu ("HNL").
delay <- filter(delay, count > 20, dest != "HNL")

# Mean delay against mean distance, one point per destination sized by its
# flight count, plus a smoother drawn without its standard-error band.
delay %>%
  ggplot(aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)

There are three steps to prepare this data:

  • Group flights by destination.

  • Summarise to compute distance, average delay, and number of flights.

  • Filter to remove noisy points and Honolulu airport, which is almost twice as far away as the next closest airport.

5.6.2 Missing values

# Mean departure delay per day. mean() is deliberately called without na.rm,
# so any day containing a missing dep_delay produces NA.
summarise(
  group_by(flights, year, month, day),
  mean = mean(dep_delay)
)

  • all aggregation functions have an na.rm argument which removes the missing values prior to computation

5.6.3 Counts

# `not_cancelled` is used from here on but is never created in these notes,
# so running the code fails with "object 'not_cancelled' not found".
# Define it as the flights that have both delays recorded.
not_cancelled <- flights %>%
  filter(!is.na(dep_delay), !is.na(arr_delay))

# Mean arrival delay for each tail number.
delays <- not_cancelled %>%
  group_by(tailnum) %>%
  summarise(delay = mean(arr_delay))

# Frequency polygon of the per-tail-number mean delay, in bins of width 10.
delays %>%
  ggplot(aes(x = delay)) +
  geom_freqpoly(binwidth = 10)

# Per tail number: mean arrival delay (missing values removed) together with
# the number of flights the mean is based on.
delays <- summarise(
  group_by(not_cancelled, tailnum),
  delay = mean(arr_delay, na.rm = TRUE),
  n = n()
)

# Mean delay against number of flights; points are mostly transparent so
# overplotted regions remain readable.
delays %>%
  ggplot(aes(x = n, y = delay)) +
  geom_point(alpha = 1 / 10)

5.6.4 Useful summary functions

  • Measures of location: we’ve used mean(x), but median(x) is also useful.
  • The mean is the sum divided by the length; the median is a value where 50% of x is above it, and 50% is below it.

  • Measures of spread: sd(x), IQR(x), mad(x).
  • The root mean squared deviation, or standard deviation sd(x), is the standard measure of spread.
  • The interquartile range IQR(x) and median absolute deviation mad(x) are robust equivalents that may be more useful if you have outliers.

  • Measures of rank: min(x), quantile(x, 0.25), max(x). Quantiles are a generalisation of the median.
    • For example, quantile(x, 0.25) will find a value of x that is greater than 25% of the values, and less than the remaining 75%.
  • Measures of position: first(x), nth(x, 2), last(x).
    • These work similarly to x[1], x[2], and x[length(x)] but let you set a default value if that position does not exist (i.e. you’re trying to get the 3rd element from a group that only has two elements).
  • Counts: You’ve seen n(), which takes no arguments, and returns the size of the current group.
  • To count the number of non-missing values, use sum(!is.na(x)).
  • To count the number of distinct (unique) values, use n_distinct(x)

  • Counts and proportions of logical values: sum(x > 10), mean(y == 0).
  • When used with numeric functions, TRUE is converted to 1 and FALSE to 0.
  • This makes sum() and mean() very useful: sum(x) gives the number of TRUEs in x, and mean(x) gives the proportion.

5.6.5 Grouping by multiple variables

# Group by calendar day. This must be its own statement: when fused with the
# next line, R would try to call the group_by() result as a function.
daily <- group_by(flights, year, month, day)

# Count flights per day; the outer parentheses print the result as well.
(per_day <- summarise(daily, flights = n()))

# Sum the per-day counts into per_month; the parentheses print the result.
(per_month <- per_day %>% summarise(flights = sum(flights)))

# Sum the per-month counts into per_year.
(per_year <- per_month %>% summarise(flights = sum(flights)))

5.6.6 Ungrouping

  • To remove grouping, and return to operations on ungrouped data, use ungroup().

5.6.7 Exercises

1). If a flight is always 30 minutes late and that delay is known, then it is as if the arrival time is that delayed time.

2).

# Number of flights per destination, computed without count().
not_cancelled %>%
  group_by(dest) %>%
  summarise(n = length(dest))

3).

# Flights that have a departure delay recorded but no arrival delay.
# (The explanatory sentence used to be fused onto this line after a "-",
# which R would parse as a subtraction.)
filter(flights, !is.na(dep_delay), is.na(arr_delay)) %>%
  select(dep_time, arr_time, sched_arr_time, dep_delay, arr_delay)

  • The most important column is arr_delay, which indicates the amount of delay in arrival.

4).

# Per day: proportion of cancelled flights (either delay missing) and the
# mean departure and arrival delays.
# The pipeline used to end in a dangling `%>%`, which piped the summary into
# the ggplot() call below before `cancelled_and_delays` existed. It now ends
# with ungroup(), matching the other definition of this object in the notes.
cancelled_and_delays <-
  flights %>%
  mutate(cancelled = (is.na(arr_delay) | is.na(dep_delay))) %>%
  group_by(year, month, day) %>%
  summarise(
    cancelled_prop = mean(cancelled),
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    avg_arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  ungroup()

# Proportion of cancelled flights against the average arrival delay per day.
ggplot(cancelled_and_delays) +
  geom_point(aes(x = avg_arr_delay, y = cancelled_prop))

5).

# Mean arrival delay per carrier (missing values removed), largest first.
arrange(
  summarise(
    group_by(flights, carrier),
    arr_delay = mean(arr_delay, na.rm = TRUE)
  ),
  desc(arr_delay)
)

6). The sort argument to count() sorts the results in order of n. You could use this anytime you would run count() followed by arrange()

5.7.1 Exercises

1).

2).

# For each known tail number, compute the share of flights that arrived
# (arr_time present) with no arrival delay, together with the flight count,
# then keep the tail number(s) with the lowest share.
flights %>%
  filter(!is.na(tailnum)) %>%
  group_by(tailnum) %>%
  summarise(
    on_time = mean(!is.na(arr_time) & (arr_delay <= 0)),
    n = n()
  ) %>%
  filter(min_rank(on_time) == 1)

3).

# Mean arrival delay per scheduled hour (missing values removed), ordered
# from the smallest mean delay to the largest.
arrange(
  summarise(
    group_by(flights, hour),
    arr_delay = mean(arr_delay, na.rm = TRUE)
  ),
  arr_delay
)

4).

# Daily summary: share of flights with a missing arrival or departure delay,
# plus the mean departure and arrival delays. The result is ungrouped.
cancelled_and_delays <- flights %>%
  group_by(year, month, day) %>%
  summarise(
    cancelled_prop = mean(is.na(arr_delay) | is.na(dep_delay)),
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    avg_arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  ungroup()
# Among flights that arrived late, express each flight's arrival delay as a
# proportion of the total arrival delay of its destination, and list the
# largest contributors first within each destination.
flights %>%
  filter(arr_delay > 0) %>%
  group_by(dest) %>%
  mutate(arr_delay_prop = arr_delay / sum(arr_delay)) %>%
  arrange(dest, desc(arr_delay_prop)) %>%
  select(
    dest, month, day, dep_time, carrier, flight,
    arr_delay, arr_delay_prop
  )

5). After about 8-hours, a delayed flight is likely to be followed by a flight leaving on time.

# Order flights chronologically within each origin, attach the departure
# delay of the preceding flight from the same origin, and drop rows where
# either the current or the previous delay is missing.
lagged_delays <- flights %>%
  arrange(origin, month, day, dep_time) %>%
  group_by(origin) %>%
  mutate(dep_delay_lag = lag(dep_delay)) %>%
  filter(!(is.na(dep_delay) | is.na(dep_delay_lag)))
# Mean departure delay for each value of the previous flight's delay,
# with x-axis breaks every 120 minutes.
ggplot(
  summarise(
    group_by(lagged_delays, dep_delay_lag),
    dep_delay_mean = mean(dep_delay)
  ),
  aes(x = dep_delay_lag, y = dep_delay_mean)
) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 1500, by = 120)) +
  labs(x = "Previous Departure Delay", y = "Departure Delay")
# Same relationship as above, computed separately per origin and drawn as
# one panel per origin stacked in a single column.
ggplot(
  summarise(
    group_by(lagged_delays, origin, dep_delay_lag),
    dep_delay_mean = mean(dep_delay)
  ),
  aes(x = dep_delay_lag, y = dep_delay_mean)
) +
  geom_point() +
  facet_wrap(~origin, ncol = 1) +
  labs(x = "Previous Departure Delay", y = "Departure Delay")
# Speed of each flight as distance per hour of air time, fastest first,
# showing only the columns needed to identify the flight.
select(
  arrange(
    mutate(flights, mph = distance / (air_time / 60)),
    desc(mph)
  ),
  origin, dest, mph, year, month, day, dep_time, flight, carrier,
  dep_delay, arr_delay
)

7).

flights %>%
  # keep only destinations served by more than one carrier
  group_by(dest) %>%
  filter(n_distinct(carrier) > 1) %>%
  # rank carriers by the number of such destinations they serve
  group_by(carrier) %>%
  summarize(n_dest = n_distinct(dest)) %>%
  arrange(desc(n_dest))
# Look up the airline rows for carrier code "EV" ...
airlines %>% filter(carrier == "EV")
# ... and for carrier codes "AS", "F9" and "HA".
airlines %>% filter(carrier %in% c("AS", "F9", "HA"))

8). The exception is flights on the days on which daylight saving time started (March 10) or ended (November 3).

# For each tail number, count the flights (in date order, missing delays
# dropped) that occur before its first departure delay of more than 60:
# the running count of such delays is still below 1 for exactly those rows.
flights %>%
  filter(!is.na(dep_delay)) %>%
  select(tailnum, year, month, day, dep_delay) %>%
  arrange(tailnum, year, month, day) %>%
  group_by(tailnum) %>%
  summarise(total_flights = sum(cumsum(dep_delay > 60) < 1)) %>%
  arrange(total_flights)